# -*- encoding: utf-8 -*-
"""
@File    :   find_city.py
@Time    :   2020/07/13 15:31:55
@Author  :   Song Zewen
@Version :   1.0
@Contact :   stg1205@163.com
@License :   (C)Copyright 2020-2021, Liugroup-NLPR-CASIA
@Desc    :   爬下来之后发现虽然按城市搜索，但网站会把非常多的全国职位
             也放出来，每个城市都有，会有很多重复，正好保存了url，把
             详情页的城市扒下来
"""


import os

import xlrd
from bs4 import BeautifulSoup
from xlutils.copy import copy

# NOTE(review): original line was `import .crawler_util as cu`, which is a
# SyntaxError (`import .name` is not valid Python). Since this file runs as a
# script (`__main__` guard), a plain sibling-module import is the correct form.
import crawler_util as cu


dir_path = '../datasets/dataset/'


def _fetch_locations(urls, fname):
    """Scrape the job location from each detail-page URL.

    Returns one string per URL, in order. A single space ' ' is used as a
    placeholder when the page cannot be fetched or has no '.job_position'
    element, so the result stays row-aligned with the input column.
    """
    locs = []
    for i, url in enumerate(urls):
        loc_text = ' '  # placeholder keeps output aligned with input rows
        html = cu.get_html(url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            loc = soup.select('.job_position')
            if loc:
                loc_text = loc[0].text
                print('----------------get {}th location! of {}---------------'.format(i, fname))
        locs.append(loc_text)
    return locs


if __name__ == "__main__":

    # For every workbook in the dataset directory: read the URL column,
    # scrape each detail page's city, and write the cities into column 12
    # of a writable copy saved back over the original file.
    # TODO(review): assumes every entry in dir_path is a readable .xls
    # workbook — confirm, or skip non-workbook files explicitly.
    for f in os.listdir(dir_path):
        file_path = os.path.join(dir_path, f)
        book = xlrd.open_workbook(file_path)
        sheet = book.sheet_by_index(0)
        # Column 10 holds the detail-page URLs; [1:] skips the header row.
        urls = sheet.col_values(10)[1:]

        locs = _fetch_locations(urls, f)

        # xlrd workbooks are read-only; xlutils.copy produces an xlwt
        # workbook that supports writing.
        new_book = copy(book)
        new_sheet = new_book.get_sheet(0)
        new_sheet.write(0, 12, '位置')
        for row, loc in enumerate(locs, start=1):
            new_sheet.write(row, 12, loc)

        new_book.save(file_path)
        print('------------------save {}!--------------------'.format(file_path))
